Examples
Comparison with MathOptInterface on a Probability Simplex
In this example, we project a random point onto a probability simplex with the Frank-Wolfe algorithm using either the specialized LMO defined in the package or a generic LP formulation using MathOptInterface.jl (MOI) and GLPK as underlying LP solver. It can be found as Example 4.4 in the paper.
using FrankWolfe
using LinearAlgebra
using LaTeXStrings
using Plots
using JuMP
const MOI = JuMP.MOI
import GLPK
# Problem dimension and iteration budget.
n = Int(1e3)
k = 10000
# Random target point, normalized so it lies on the probability simplex.
xpi = rand(n);
total = sum(xpi);
const xp = xpi ./ total;
# Objective: squared Euclidean distance to the target point `xp`.
function f(x)
    return norm(x - xp)^2
end
# In-place gradient of f: ∇f(x) = 2 (x - xp), written into `storage`.
function grad!(storage, x)
    storage .= 2 .* (x .- xp)
    return nothing
end
# Closed-form LMO over the l1-norm ball of the given radius.
lmo_radius = 2.5
lmo = FrankWolfe.LpNormLMO{Float64,1}(lmo_radius)
# Starting point: an extreme point (vertex) of the feasible region.
x00 = FrankWolfe.compute_extreme_point(lmo, zeros(n))
gradient = collect(x00)
# Frank-Wolfe with the specialized closed-form l1-ball LMO, recording the
# trajectory for the comparison plots below.
# NOTE: pasted REPL output that had been fused into the source here (and the
# following section text) has been removed — it was not valid Julia.
x_lmo, v, primal, dual_gap, trajectory_lmo = FrankWolfe.frank_wolfe(
    f,
    grad!,
    lmo,
    collect(copy(x00)),  # dense, mutable copy of the starting vertex
    max_iteration=k,
    line_search=FrankWolfe.Shortstep(),
    L=2,                 # f(x) = ||x - xp||^2 is 2-smooth
    print_iter=k / 10,
    emphasis=FrankWolfe.memory,
    verbose=true,
    trajectory=true,
);

# Create a MathOptInterface Optimizer and build the same linear constraints:
# Build the equivalent feasible region directly with MathOptInterface:
# x ≥ 0 componentwise, and the coordinates sum to the l1-ball radius.
o = GLPK.Optimizer()
x = MOI.add_variables(o, n)
for xi in x
MOI.add_constraint(o, xi, MOI.GreaterThan(0.0))
end
MOI.add_constraint(
o,
MOI.ScalarAffineFunction(MOI.ScalarAffineTerm.(1.0, x), 0.0),
MOI.EqualTo(lmo_radius),
)
# Wrap the optimizer as an LMO that solves an LP at every iteration.
lmo_moi = FrankWolfe.MathOptLMO(o)
# Same problem solved through the generic MOI-backed LMO.
# NOTE: pasted REPL output that had been fused into the source here (and the
# following section text) has been removed — it was not valid Julia.
x, v, primal, dual_gap, trajectory_moi = FrankWolfe.frank_wolfe(
    f,
    grad!,
    lmo_moi,
    collect(copy(x00)),
    max_iteration=k,
    line_search=FrankWolfe.Shortstep(),
    L=2,
    print_iter=k / 10,
    emphasis=FrankWolfe.memory,
    verbose=true,
    trajectory=true,
);

# Alternatively, we can use one of the modelling interfaces based on MOI to
# formulate the LP. The following example builds the same set of constraints
# using JuMP:
# Same feasible region, formulated with JuMP on top of GLPK.
m = JuMP.Model(GLPK.Optimizer)
@variable(m, y[1:n] ≥ 0)
@constraint(m, sum(y) == lmo_radius)
# Hand the JuMP model's MOI backend to the generic MathOpt LMO.
lmo_jump = FrankWolfe.MathOptLMO(m.moi_backend)
# Frank-Wolfe run with the JuMP-built LMO (memory emphasis).
x, v, primal, dual_gap, trajectory_jump = FrankWolfe.frank_wolfe(
f,
grad!,
lmo_jump,
collect(copy(x00)),
max_iteration=k,
line_search=FrankWolfe.Shortstep(),
L=2,
print_iter=k / 10,
emphasis=FrankWolfe.memory,
verbose=true,
trajectory=true,
);
# Re-run the closed-form-LMO variant with BLAS emphasis (out-of-place
# operations) for the memory-vs-BLAS comparison.
x_lmo, v, primal, dual_gap, trajectory_lmo_blas = FrankWolfe.frank_wolfe(
f,
grad!,
lmo,
x00,
max_iteration=k,
line_search=FrankWolfe.Shortstep(),
L=2,
print_iter=k / 10,
emphasis=FrankWolfe.blas,
verbose=true,
trajectory=true,
);
# Re-run the JuMP-LMO variant with BLAS emphasis for the same comparison.
x, v, primal, dual_gap, trajectory_jump_blas = FrankWolfe.frank_wolfe(
f,
grad!,
lmo_jump,
x00,
max_iteration=k,
line_search=FrankWolfe.Shortstep(),
L=2,
print_iter=k / 10,
emphasis=FrankWolfe.blas,
verbose=true,
trajectory=true,
);
# Assemble per-algorithm series from the recorded trajectory tuples
# (entry 1: iteration, 2: primal value, 4: dual gap, 5: elapsed time).
iteration_list = [[x[1] + 1 for x in trajectory_lmo], [x[1] + 1 for x in trajectory_moi]]  # shift to 1-based for the log axis
time_list = [[x[5] for x in trajectory_lmo], [x[5] for x in trajectory_moi]]
primal_gap_list = [[x[2] for x in trajectory_lmo], [x[2] for x in trajectory_moi]]
dual_gap_list = [[x[4] for x in trajectory_lmo], [x[4] for x in trajectory_moi]]
label = [L"\textrm{Closed-form LMO}", L"\textrm{MOI LMO}"]
# 2x2 grid: primal gap vs iterations/time (top row), dual gap vs
# iterations/time (bottom row).
FrankWolfe.plot_results(
[primal_gap_list, primal_gap_list, dual_gap_list, dual_gap_list],
[iteration_list, time_list, iteration_list, time_list],
label,
["", "", L"\textrm{Iteration}", L"\textrm{Time}"],
[L"\textrm{Primal Gap}", "", L"\textrm{Dual Gap}", ""],
xscalelog=[:log, :identity, :log, :identity],
yscalelog=[:log, :log, :log, :log],
legend_position=[:bottomleft, nothing, nothing, nothing]
)
plot!(size=(3000, 2000), legendfontsize=30, annotationfontsize=30)
Polynomial Regression
The following example features the LMO for polynomial regression on the $\ell_1$ norm ball. Given input/output pairs $\{x_i,y_i\}_{i=1}^N$ and sparse coefficients $c_j$, where
\[y_i=\sum_{j=1}^m c_j f_j(x_i)\]
and $f_j: \mathbb{R}^n\to\mathbb{R}$, the task is to recover those $c_j$ that are non-zero alongside their corresponding values. Under certain assumptions, this problem can be convexified into
\[\min_{c\in\mathcal{C}}\|y-Ac\|^2\]
for a convex set $\mathcal{C}$. It can also be found as Example 4.1 in the paper. In order to evaluate the polynomial, we generate data points $\{x_i\}_{i=1}^N$ from a multivariate Gaussian, with which we will compute the output variables $\{y_i\}_{i=1}^N$. After evaluating the polynomial, the outputs are contaminated with noise drawn from a standard Gaussian. We run the away_frank_wolfe and blended_conditional_gradient algorithms, and compare them to Projected Gradient Descent using a smoothness estimate. We will evaluate the output solution on test points drawn in a similar manner as the training points.
using FrankWolfe
using LinearAlgebra
import Random
using MultivariatePolynomials
using DynamicPolynomials
using Plots
using LaTeXStrings
# Number of input variables and the polynomial variables themselves.
const N = 15
DynamicPolynomials.@polyvar X[1:15]
const max_degree = 4
coefficient_magnitude = 10
noise_magnitude = 1
# All monomials in X up to the maximum degree.
const var_monomials = MultivariatePolynomials.monomials(X, 0:max_degree)
Random.seed!(42)
# Random sparse coefficients: the zeroing probability grows with the
# monomial's degree, so higher-degree terms are more likely to vanish.
const all_coeffs = map(var_monomials) do m
d = MultivariatePolynomials.degree(m)
return coefficient_magnitude * rand() .* (rand() .> 0.95 * d / max_degree)
end
const true_poly = dot(all_coeffs, var_monomials)
# 500 noisy training samples: evaluate the true polynomial at small random
# points and add Gaussian noise to the output.
const training_data = map(1:500) do _
x = 0.1 * randn(N)
y = MultivariatePolynomials.subs(true_poly, Pair(X, x)) + noise_magnitude * randn()
return (x, y.a[1])
end
# Lift each sample into the monomial feature space, so the model becomes
# linear in the coefficients.
const extended_training_data = map(training_data) do (x, y)
x_ext = getproperty.(MultivariatePolynomials.subs.(var_monomials, X => x), :α)
return (x_ext, y)
end
# 1000 test samples, drawn from a wider distribution than the training set.
const test_data = map(1:1000) do _
x = 0.4 * randn(N)
y = MultivariatePolynomials.subs(true_poly, Pair(X, x)) + noise_magnitude * randn()
return (x, y.a[1])
end
const extended_test_data = map(test_data) do (x, y)
x_ext = getproperty.(MultivariatePolynomials.subs.(var_monomials, X => x), :α)
return (x_ext, y)
end
# Training loss: half the mean squared residual over the extended
# (monomial-feature) training set.
function f(coefficients)
    n_samples = length(extended_training_data)
    total = sum((dot(coefficients, x) - y)^2 for (x, y) in extended_training_data)
    return 0.5 / n_samples * total
end
# Test loss: half the mean squared residual over the extended test set.
function f_test(coefficients)
    n_samples = length(extended_test_data)
    total = sum((dot(coefficients, x) - y)^2 for (x, y) in extended_test_data)
    return 0.5 / n_samples * total
end
# Half the squared l2 distance between the estimated coefficients and the
# true coefficients `all_coeffs`.
function coefficient_errors(coeffs)
    return 0.5 * sum((all_coeffs[i] - coeffs[i])^2 for i in eachindex(all_coeffs))
end
# In-place gradient of the training loss with respect to the coefficients:
# ∇f(c) = (1/N) Σ_i (⟨c, x_i⟩ - y_i) x_i over the training samples.
function grad!(storage, coefficients)
    fill!(storage, 0)
    for (features, target) in extended_training_data
        residual = dot(coefficients, features) - target
        storage .+= features .* residual
    end
    storage ./= length(training_data)
    return nothing
end
# Return a closure usable as a FrankWolfe `callback`: on every call it appends
# the first five entries of the state tuple plus the test loss and the
# coefficient error at the current iterate to `trajectory_arr`.
function build_callback(trajectory_arr)
    return function (state)
        entry = (Tuple(state)[1:5]..., f_test(state.x), coefficient_errors(state.x))
        return push!(trajectory_arr, entry)
    end
end
gradient = similar(all_coeffs)
max_iter = 100_000
random_initialization_vector = rand(length(all_coeffs))
# l1-ball LMO with radius slightly below the l1 norm of the true coefficients.
lmo = FrankWolfe.LpNormLMO{1}(0.95 * norm(all_coeffs, 1))
# Estimating smoothness parameter
num_pairs = 10000
L_estimate = -Inf
gradient_aux = similar(gradient)
# Project `x` onto the l1-norm ball of radius `τ`.
#
# If `x` already lies inside the ball it is returned unchanged (same object,
# no copy). Otherwise the vector of magnitudes |x| is projected onto the
# simplex of radius `τ` using the sort-based algorithm, and the original
# signs are restored afterwards.
function projnorm1(x, τ)
    len = length(x)
    if norm(x, 1) ≤ τ
        return x
    end
    mags = abs.(x)
    # Simplex projection of the magnitudes: scan them in decreasing order to
    # find the shrinkage threshold θ.
    order = sortperm(mags, rev=true)
    running = zero(τ)
    θ = running
    found = false
    @inbounds for j in 1:len-1
        running += mags[order[j]]
        θ = (running - τ) / j
        if θ ≥ mags[order[j+1]]
            found = true
            break
        end
    end
    if !found
        # All entries stay active: threshold uses the full sum.
        θ = (running + mags[order[len]] - τ) / len
    end
    # Shrink each magnitude by θ, clip at zero, and restore the sign.
    @inbounds for j in 1:len
        mags[j] = max(mags[j] - θ, 0)
        mags[j] *= sign(x[j])
    end
    return mags
end
# Sample pairs of extreme points of the feasible region and take the largest
# observed ratio ||∇f(x) - ∇f(y)|| / ||x - y|| as an empirical estimate of the
# gradient's Lipschitz constant L.
# NOTE: section text that had been fused onto the closing `end` here has been
# moved into the trailing comment — it was not valid Julia.
for i in 1:num_pairs
    global L_estimate
    # Qualified calls for consistency with the rest of the file.
    x = FrankWolfe.compute_extreme_point(lmo, randn(size(all_coeffs)))
    y = FrankWolfe.compute_extreme_point(lmo, randn(size(all_coeffs)))
    grad!(gradient, x)
    grad!(gradient_aux, y)
    new_L = norm(gradient - gradient_aux) / norm(x - y)
    if new_L > L_estimate
        L_estimate = new_L
    end
end

# We can now perform projected gradient descent:
# Projected gradient descent baseline: fixed step 1/L, projecting back onto
# the l1 ball after each step; training/test losses and coefficient errors
# are recorded at every iteration.
xgd = FrankWolfe.compute_extreme_point(lmo, random_initialization_vector)
training_gd = Float64[]
test_gd = Float64[]
coeff_error = Float64[]
time_start = time_ns()
gd_times = Float64[]
for iter in 1:max_iter
global xgd
grad!(gradient, xgd)
xgd = projnorm1(xgd - gradient / L_estimate, lmo.right_hand_side)
push!(training_gd, f(xgd))
push!(test_gd, f_test(xgd))
push!(coeff_error, coefficient_errors(xgd))
push!(gd_times, (time_ns() - time_start) * 1e-9)  # elapsed seconds
end
# Lazified Away-step Frank-Wolfe (L-AFW) run; the callback records the
# trajectory together with test loss and coefficient errors.
x00 = FrankWolfe.compute_extreme_point(lmo, random_initialization_vector)
x0 = deepcopy(x00)
trajectory_lafw = []
callback = build_callback(trajectory_lafw)
x_lafw, v, primal, dual_gap, _ = FrankWolfe.away_frank_wolfe(
f,
grad!,
lmo,
x0,
max_iteration=max_iter,
line_search=FrankWolfe.Adaptive(),
print_iter=max_iter ÷ 10,
emphasis=FrankWolfe.memory,
verbose=false,
lazy=true,
gradient=gradient,
callback=callback,
L=L_estimate,
)
# Blended Conditional Gradient (BCG) run from the same starting point.
trajectory_bcg = []
callback = build_callback(trajectory_bcg)
x0 = deepcopy(x00)
x_bcg, v, primal, dual_gap, _ = FrankWolfe.blended_conditional_gradient(
f,
grad!,
lmo,
x0,
max_iteration=max_iter,
line_search=FrankWolfe.Adaptive(),
print_iter=max_iter ÷ 10,
emphasis=FrankWolfe.memory,
verbose=false,
weight_purge_threshold=1e-10,
callback=callback,
L=L_estimate,
)
# Longer reference run (twice the iteration budget) whose final primal value
# serves as the optimum estimate for the primal-gap plots below.
x0 = deepcopy(x00)
trajectory_lafw_ref = []
callback = build_callback(trajectory_lafw_ref)
_, _, primal_ref, _, _ = FrankWolfe.away_frank_wolfe(
f,
grad!,
lmo,
x0,
max_iteration=2 * max_iter,
line_search=FrankWolfe.Adaptive(),
print_iter=max_iter ÷ 10,
emphasis=FrankWolfe.memory,
verbose=false,
lazy=true,
gradient=gradient,
callback=callback,
L=L_estimate,
)
# Assemble per-algorithm series for plotting (trajectory tuple entries:
# 1 iteration, 2 primal value, 5 elapsed time, 6 test loss, 7 coeff. error).
iteration_list = [
[x[1] + 1 for x in trajectory_lafw],
[x[1] + 1 for x in trajectory_bcg],
collect(eachindex(training_gd)),
]
time_list = [
[x[5] for x in trajectory_lafw],
[x[5] for x in trajectory_bcg],
gd_times,
]
# Primal gaps relative to the reference run's final primal value.
primal_list = [
[x[2] - primal_ref for x in trajectory_lafw],
[x[2] - primal_ref for x in trajectory_bcg],
[x - primal_ref for x in training_gd],
]
test_list = [
[x[6] for x in trajectory_lafw],
[x[6] for x in trajectory_bcg],
test_gd,
]
label = [L"\textrm{L-AFW}", L"\textrm{BCG}", L"\textrm{GD}"]
coefficient_error_values = [
[x[7] for x in trajectory_lafw],
[x[7] for x in trajectory_bcg],
coeff_error,
]
# 2x2 grid: primal gap vs iterations/time (top), test loss vs iterations/time
# (bottom).
FrankWolfe.plot_results(
[primal_list, primal_list, test_list, test_list],
[iteration_list, time_list, iteration_list, time_list],
label,
[L"\textrm{Iteration}", L"\textrm{Time}", L"\textrm{Iteration}", L"\textrm{Time}"],
[L"\textrm{Primal Gap}", L"\textrm{Primal Gap}", L"\textrm{Test loss}", L"\textrm{Test loss}"],
xscalelog=[:log, :identity, :log, :identity],
legend_position=[:bottomleft, nothing, nothing, nothing],
)
plot!(size=(3000, 2000), legendfontsize=30, annotationfontsize=30)